library(sigminer)
library(NMF)
library(ggplot2)
library(ggpubr)
library(tidyverse)
library(survival)
library(ggpubr)
library(survminer)
library(rstatix)
library(ezcox)
library(SummarizedExperiment)
library(TCGAbiolinks)
library(maftools)library(GenomicRanges)
ncore = 1
sigminer_prad_sequenza<-read_copynumber(input ="/xdisk/mpadi/jiawenyang/data/centrosome_loss/sigminer/PC_CNA_signature/data/CNV_from_sequenza.tsv",
genome_build = "hg38",
complement = FALSE,
verbose = T)
sigminer_prad_sequenza.tally.W<-sig_tally(sigminer_prad_sequenza, method = "W", core = ncore, feature_setting = CN.features)
CNV_sequenza_sigminer_prad_matrix<-sigminer_prad_sequenza.tally.W$nmf_matrixdata_sequenza<-c("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sequenza_gc1000/graph/")
FILES_sequenza<-list.files(path = data_sequenza,
pattern = "*.segments.txt$",
recursive = T,
full.names = T)
cnv_centrosome_loss_sequenza<-data.frame()
for (i in 1:length(FILES_sequenza)){
file_df = read.table(FILES_sequenza[i], header = T)
sample_id<-substr(strsplit(strsplit(FILES_sequenza[i], "/")[[1]][10], "[.]")[[1]][1], 1, nchar(strsplit(strsplit(FILES_sequenza[i], "/")[[1]][10], "[.]")[[1]][1]) - 9)
file_df[,"sample"]<-rep(sample_id,nrow(file_df))
cnv_centrosome_loss_sequenza<-rbind(cnv_centrosome_loss_sequenza, file_df)
}
colnames(cnv_centrosome_loss_sequenza)[1]<-"chrom"
cnv_centrosome_loss_sequenza<-cnv_centrosome_loss_sequenza[,c("chrom", "start.pos", "end.pos", "CNt", "A", "B", "sample")]
centrosome_loss_sequenza_cnv<-readRDS("/xdisk/mpadi/jiawenyang/data/centrosome_loss/wgs/processed/cnv_centrosome_loss_sequenza.rds")
cnv_sequenza_cl_Sigminer<-centrosome_loss_sequenza_cnv[, c(1,2,3,5,7)]
colnames(cnv_sequenza_cl_Sigminer)<-colnames(sigminer_prad_sequenza)cnv_sequenza_cl_Sigminer<-read.table(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/wgs/processed/cnv_sequenza_cl_Sigminer.tsv")
cnv_sequenza_cl_Sigminer_cells<-cnv_sequenza_cl_Sigminer[cnv_sequenza_cl_Sigminer$sample %in% c("CN_1_1_CKDN200002992-1A_H57L2DSXY_L1", "CN_6_1_CKDN200002994-1A_H57L2DSXY_L1", "CN_9_1_CKDN200002996-1A_H57L2DSXY_L1"),]ncore = 1
CNV.sequenza_cl_cells<-read_copynumber(input = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/wgs/processed/cnv_sequenza_cl_Sigminer_cells.tsv",
genome_build = "hg38",
complement = FALSE,
verbose = T
)
CNV.sequenza_cl.tally.W_cells<-sig_tally(CNV.sequenza_cl_cells, method = "W", core = ncore, feature_setting = CN.features)
CNV_sequenza_sigminer_matrix_cells<-CNV.sequenza_cl.tally.W_cells$nmf_matrixcombine_sequenza_nmf_matrix<-readRDS("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/combine_sequenza_centrosome_loss_cells_nmf_matrix.RDS")
load("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/EST.CNV.sequenza_cl_cells_and_patients.W.all.Rdata")
p_CNV.sequenza_cl_cell_and_PRAD_patients<-show_sig_number_survey(EST.CNV.sequenza_cl_cells_and_patients.W.all, right_y = NULL)
p_CNV.sequenza_cl_cell_and_PRAD_patientsncores = 1
Sig.CNV.seqz_cl_cells_prad_patients <- sig_extract(combine_sequenza_nmf_matrix, n_sig = 3, nrun = 50, cores = ncores)
Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups<-get_groups(Sig.CNV.seqz_cl_cells_prad_patients)
Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groupsSig.CNV.seqz_cl_cells_prad_patients<-readRDS(file ="/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups.RDS")
p_CNV.seqz_cl_cells_prad_patients_3_sig_groups<-show_sig_profile(Sig.CNV.seqz_cl_cells_prad_patients,
mode = "copynumber",
style = "cosmic", method = "W", normalize = "feature")
p_CNV.seqz_cl_cells_prad_patients_3_sig_groups##
## Sig1 Sig2 Sig3
## 1 0 0 239
## 2 115 0 0
## 3 193 281 113
seqz_sig1_samples<-Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups[Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups$enrich_sig == "Sig1",]
#nrow(seqz_sig1_samples) 115
seqz_sig2_samples<-Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups[Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups$enrich_sig == "Sig2",]
#nrow(seqz_sig2_samples) 587
seqz_sig3_samples<-Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups[Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups$enrich_sig == "Sig3",]
#nrow(seqz_sig3_samples) 239
Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups## sample group silhouette_width enrich_sig
## <char> <char> <num> <char>
## 1: WCMC7520-SRR3146865 1 0.985 Sig3
## 2: STID0000002915-SRR477768 1 0.945 Sig3
## 3: WCMC12728-SRR3146836 1 0.985 Sig3
## 4: 6-SRR4043956 1 0.971 Sig3
## 5: 59-SRR4043832 1 0.971 Sig3
## ---
## 937: 01-2382-SRR471433 3 0.861 Sig2
## 938: 00-1823-SRR471613 3 0.886 Sig2
## 939: 00-160-SRR471373 3 0.750 Sig2
## 940: 00-000450-SRR474658 3 0.783 Sig2
## 941: 00-1165-SRR461844 3 0.686 Sig2
Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups<-readRDS(file ="/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups.RDS")
clinical_info<-readRDS(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/sigminer/PC_CNA_signature/data/PRAD_CLINICAL.rds")
Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups<-get_groups(Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups)##
## Sig1 Sig2 Sig3
## 1 0 0 239
## 2 115 0 0
## 3 193 281 113
Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups[grep("SRR", Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups$sample), "sample"] <-
unlist(lapply(Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups$sample[grep("-SRR", Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups$sample)], function(x) str_extract(x, "SRR\\w+")))
Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups[,"tumor_Run"]<-Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups$sample
Sig.CNV.seqz_cl_cells_prad_patients_with_clinical<-dplyr::left_join(Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups, clinical_info, by = "tumor_Run")
DT::datatable(Sig.CNV.seqz_cl_cells_prad_patients_with_clinical)sample_type_count<-as.data.frame(Sig.CNV.seqz_cl_cells_prad_patients_with_clinical %>% group_by(enrich_sig) %>% dplyr::count(sample_type))
sample_type_count<-na.omit(sample_type_count)
p <- ggplot(data=sample_type_count,
aes(x=factor(enrich_sig, levels = c("Sig1", "Sig2", "Sig3")),
y=n,
fill=sample_type)) +
scale_fill_manual(values=c("#ff7768", "#ffd600","grey")) +
geom_bar(position = "fill", stat="identity", width = 0.5) +
geom_text(aes(label = n), size = 3, hjust = 0.5, vjust = 3, position = "fill") +
xlab("Copy number signature enrichment groups") +
ylab("Percentage of each sample type") +
ggtitle("Sample types within each copy number signature") +
theme_bw () +
theme(title = element_text(face = "bold"),
legend.title = element_text(face = "bold"),
legend.text = element_text(face = "bold"),
axis.text.x = element_text(face = "bold"),
axis.text.y = element_text(face = "bold"),
axis.title.x = element_text(face = "bold"),
strip.text.x = element_blank(),
strip.text.y = element_text(angle = 0, face = "bold", hjust = 0),
#strip.background =element_rect(fill="white"),
legend.position = "top",
panel.spacing = unit(0,'lines'))
psample_type_df<-data.frame("Sig1" = c(72, 43), "Sig2" = c(33, 545), "Sig3" = c(185, 50))
fisher.test(sample_type_df)##
## Fisher's Exact Test for Count Data
##
## data: sample_type_df
## p-value < 2.2e-16
## alternative hypothesis: two.sided
surv_df <- readRDS(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/sigminer/PC_CNA_signature/data/PRAD_Survival.rds")
surv_df <- as.data.frame(surv_df)
surv_df$sample[grep("TCGA",surv_df$sample)]<-paste0(surv_df$sample[grep("TCGA",surv_df$sample)], "-01")
df.CNV.seqz_cl_cells_prad_patients<-as.data.frame(t(Sig.CNV.seqz_cl_cells_prad_patients$Exposure))
df.CNV.seqz_cl_cells_prad_patients[,"sample"]<-rownames(df.CNV.seqz_cl_cells_prad_patients)
range01 <- function(x, ...) {
(x - min(x, ...)) / (max(x, ...) - min(x, ...))
}
surv_dt <- dplyr::inner_join(surv_df, df.CNV.seqz_cl_cells_prad_patients ,
by = c("sample"))
#dup_ids = which(duplicated(surv_dt$sample))
# Remove duplicated records if needed
cols_to_sigs.seqz <- c(paste0("Sig", c(1:3)))
surv_dt = surv_dt %>%
dplyr::mutate_at(cols_to_sigs.seqz, ~./10) %>%
dplyr::mutate(OS.time = OS.time / 30.5,
PFI.time = PFI.time / 30.5)
p_os = show_forest(surv_dt,
covariates = cols_to_sigs.seqz,
time = "OS.time", status = "OS", merge_models = TRUE)
p_pfs = show_forest(surv_dt,
covariates = cols_to_sigs.seqz,
time = "PFI.time", status = "PFI", merge_models = TRUE)
p_os#Survival analysis (data downloaded from cbioportal) =================
tcga_clinical_cbioportal<-read.delim("/xdisk/mpadi/jiawenyang/data/centrosome_loss/cbioportal_dataset/prad_tcga_pan_can_atlas_2018/data_clinical_patient.txt", sep = "\t", stringsAsFactors = F, header = 1)
##Clinical data organization==========================
tcga_clinical_cbioportal_dat<-tcga_clinical_cbioportal[,c("X.Patient.Identifier","Overall.Survival..Months.","Overall.Survival.Status", "Disease.Free..Months.","Disease.Free.Status")]
seqz_sig1_patient<-Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups[Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups$enrich_sig == "Sig1", ]
seqz_sig1_patient_tcga<-seqz_sig1_patient$sample[grep("TCGA", seqz_sig1_patient$sample)]
seqz_sig2_patient<-Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups[Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups$enrich_sig == "Sig2", ]
seqz_sig2_patient_tcga<-seqz_sig2_patient$sample[grep("TCGA", seqz_sig2_patient$sample)]
seqz_sig3_patient<-Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups[Sig.CNV.seqz_cl_cells_prad_patients_3_sig_groups$enrich_sig == "Sig3", ]
seqz_sig3_patient_tcga<-seqz_sig3_patient$sample[grep("TCGA", seqz_sig3_patient$sample)]
sig1_patient<-gsub("-01", "", seqz_sig1_patient_tcga)
sig2_patient<-gsub("-01", "", seqz_sig2_patient_tcga)
sig3_patient<-gsub("-01", "", seqz_sig3_patient_tcga)
sig1_patient_clinical_data<-tcga_clinical_cbioportal_dat[tcga_clinical_cbioportal_dat$X.Patient.Identifier %in% sig1_patient, ]
sig2_patient_clinical_data<-tcga_clinical_cbioportal_dat[tcga_clinical_cbioportal_dat$X.Patient.Identifier %in% sig2_patient, ]
sig3_patient_clinical_data<-tcga_clinical_cbioportal_dat[tcga_clinical_cbioportal_dat$X.Patient.Identifier %in% sig3_patient, ]
sig1_patient_clinical_data$overall_status<-unlist(lapply(sig1_patient_clinical_data$Overall.Survival.Status, function(x) strsplit(x, ":")[[1]][1]))
sig1_patient_clinical_data$group_status<-rep(0, nrow(sig1_patient_clinical_data))
sig2_patient_clinical_data$overall_status<-unlist(lapply(sig2_patient_clinical_data$Overall.Survival.Status, function(x) strsplit(x, ":")[[1]][1]))
sig2_patient_clinical_data$group_status<-rep(1, nrow(sig2_patient_clinical_data))
sig3_patient_clinical_data$overall_status<-unlist(lapply(sig3_patient_clinical_data$Overall.Survival.Status, function(x) strsplit(x, ":")[[1]][1]))
sig3_patient_clinical_data$group_status<-rep(2, nrow(sig3_patient_clinical_data))
clinical_dat_for_survival_curve<-rbind(sig1_patient_clinical_data, sig2_patient_clinical_data, sig3_patient_clinical_data)
##overall_survival==============================
tcga_overall<-clinical_dat_for_survival_curve[,c("X.Patient.Identifier","Overall.Survival..Months.","overall_status","group_status")]
tcga_overall$Overall.Survival..Months.<-as.numeric(tcga_overall$Overall.Survival..Months.)
tcga_overall$overall_status<-as.numeric(tcga_overall$overall_status)
#Disease_free_survival=========================
tcga_diseasefree<-clinical_dat_for_survival_curve[which(clinical_dat_for_survival_curve$Disease.Free..Months.!=""), ]
tcga_diseasefree$diseasefree_status<-unlist(lapply(tcga_diseasefree$Disease.Free.Status, function(x) strsplit(x, ":")[[1]][1]))
tcga_diseasefree<-tcga_diseasefree[,c("X.Patient.Identifier","Disease.Free..Months.","diseasefree_status","group_status")]
tcga_diseasefree$Disease.Free..Months.<-as.numeric(tcga_diseasefree$Disease.Free..Months.)
tcga_diseasefree$diseasefree_status<-as.numeric(tcga_diseasefree$diseasefree_status)
#Kaplan-meier curve generation========================
overall_S<-Surv(tcga_overall$Overall.Survival..Months., tcga_overall$overall_status)
overall_Sfit<-survfit(Surv(Overall.Survival..Months., overall_status)~group_status, data = tcga_overall)
tcga_prad_overall<-ggsurvplot(overall_Sfit,
conf.int = F,
pval = T,
risk.table = T,
legend.labs = c("CN-sig1", "CN-sig2", "CN-sig3"),
legend.title = "Copy number signature",
xlab = "Time (month)",
title = "Kaplan-Meier Curve for TCGA Prostate cancer patients \n -Overall survival",
palette = c("#050506", "#DE8B36", "#CD4224"),
font.tickslab = c(15),
risk.table.height = .15)
tcga_prad_overalldiseasefree_Sfit<-survfit(Surv(Disease.Free..Months., diseasefree_status)~group_status, data = tcga_diseasefree)
tcga_prad_diseasefree<-ggsurvplot(diseasefree_Sfit,
conf.int = F,
pval = T,
risk.table = T,
legend.labs = c("CN-sig1", "CN-sig2","CN-sig3"),
legend.title = "Copy number signature",
xlab = "Time (month)",
title = "Kaplan-Meier Curve for TCGA Prostate cancer patients \n -Disease-free survival",
palette = c("#050506", "#DE8B36", "#CD4224"),
font.tickslab = c(15),
risk.table.height = .15)
tcga_prad_diseasefree# Associate with gleason score ====================================================
clinical_data<-readRDS("/xdisk/mpadi/jiawenyang/data/centrosome_loss/sigminer/PRAD_CLINICAL.rds")
clinical_data<-clinical_data[clinical_data$Study == "TCGA",]
clinical_data<-as.data.frame(clinical_data)
clinical_dat_df<-clinical_data
#clinical_dat_df$case_submitter_id<-substr(clinical_dat$case_submitter_id, 1, nchar(clinical_dat$case_submitter_id)-4)
sig1_gleason<-clinical_dat_df[clinical_dat_df$PatientID%in% sig1_patient, ]
sig1_gleason[,"CN_sig"]<-rep("CN-sig1", nrow(sig1_gleason))
sig2_gleason<-clinical_dat_df[clinical_dat_df$PatientID %in% sig2_patient, ]
sig2_gleason[,"CN_sig"]<-rep("CN-sig2", nrow(sig2_gleason))
sig3_gleason<-clinical_dat_df[clinical_dat_df$PatientID %in% sig3_patient, ]
sig3_gleason[,"CN_sig"]<-rep("CN-sig3", nrow(sig3_gleason))
gleason_df<-rbind(sig1_gleason, sig2_gleason, sig3_gleason)
# Perform one-way ANOVA
anova_result<-aov(GleasonScore ~ CN_sig, gleason_df)
summary(anova_result)## Df Sum Sq Mean Sq F value Pr(>F)
## CN_sig 2 7.6 3.797 3.763 0.0239 *
## Residuals 496 500.5 1.009
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = GleasonScore ~ CN_sig, data = gleason_df)
##
## $CN_sig
## diff lwr upr p adj
## CN-sig2-CN-sig1 -0.1635097 -0.66853300 0.3415137 0.7270383
## CN-sig3-CN-sig1 0.3214756 -0.31996637 0.9629176 0.4667853
## CN-sig3-CN-sig2 0.4849853 0.05886825 0.9111023 0.0210092
p<-ggplot(data=gleason_df, aes(x=CN_sig, y=GleasonScore , fill=CN_sig, color = CN_sig)) +
theme_light()+#change background
geom_violin(alpha = 0.4)+
geom_jitter(shape = 16, position = position_jitter(0.2)) +
geom_boxplot(width = 0.1, alpha = 0.8)+
scale_fill_manual(values=c(`CN-sig1`="#050506",`CN-sig2`="#DE8B36", `CN-sig3`="#CD4224"))+
scale_color_manual(values=c(`CN-sig1`="#050506",`CN-sig2`="#DE8B36", `CN-sig3`="#CD4224"))+
geom_hline(yintercept = mean(gleason_df$GleasonScore), linetype = 2)+
labs(title="Associate Copy number signatures with PCa Gleason Score", x="CN-signatures", y = "gleason score sum of two sites")+
theme(plot.title = element_text(size=20),
axis.text.x = element_text(size = 10), axis.text.y = element_text(size = 10),
axis.title.x = element_text(size = 15), axis.title.y = element_text(size = 15),
legend.text = element_text(size = 10), legend.title = element_text(size = 15))
plibrary(dplyr)
library(ggpubr)
library(rstatix)
library(ezcox)
TCGA_fusion<-read.table("/xdisk/mpadi/jiawenyang/data/centrosome_loss/cbioportal_dataset/prad_tcga_pan_can_atlas_2018/data_fusions.txt", sep = "\t", quote = "", header = T)
fusion_events<-as.data.frame(table(TCGA_fusion$Tumor_Sample_Barcode))
colnames(fusion_events)<-c("sample", "Freq")
sig1_patients<-readRDS("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/clinical_association_table/sequenza_cn_sig1_tcga_samples.rds")
sig2_patients<-readRDS("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/clinical_association_table/sequenza_cn_sig2_tcga_samples.rds")
sig3_patients<-readRDS("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/clinical_association_table/sequenza_cn_sig3_tcga_samples.rds")
sig_and_patients<-rbind(sig1_patients, sig2_patients, sig3_patients)
colnames(sig_and_patients)[1]<-"sample"
fusion_events<-left_join(fusion_events, sig_and_patients, by = "sample")
fusion_events$enrich_sig<-gsub("Sig", "CN-sig", fusion_events$enrich_sig)
# Perform one-way ANOVA
anova_result<-aov(Freq ~ enrich_sig, fusion_events)
summary(anova_result)## Df Sum Sq Mean Sq F value Pr(>F)
## enrich_sig 2 2242 1121.2 23.28 2.53e-10 ***
## Residuals 428 20615 48.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Freq ~ enrich_sig, data = fusion_events)
##
## $enrich_sig
## diff lwr upr p adj
## CN-sig2-CN-sig1 -5.715939 -9.295989 -2.135890 0.0005772
## CN-sig3-CN-sig1 1.863636 -2.656954 6.384227 0.5965138
## CN-sig3-CN-sig2 7.579576 4.574187 10.584964 0.0000000
p<-ggplot(data=fusion_events, aes(x=enrich_sig, y=Freq , fill=enrich_sig, color = enrich_sig)) +
theme_light()+#change background
geom_violin(alpha = 0.4)+
geom_jitter(shape = 16, position = position_jitter(0.2)) +
geom_boxplot(width = 0.1, alpha = 0.8)+
scale_fill_manual(values=c(`CN-sig1`="#050506",`CN-sig2`="#DE8B36", `CN-sig3`="#CD4224"))+
scale_color_manual(values=c(`CN-sig1`="#050506",`CN-sig2`="#DE8B36", `CN-sig3`="#CD4224"))+
labs(x="CN-signatures", y = "Number of Fusion Events", title = "Associate Copy Number Signature with fusion events")+
theme(plot.title = element_text(size=20),
axis.text.x = element_text(size = 10), axis.text.y = element_text(size = 10),
axis.title.x = element_text(size = 15), axis.title.y = element_text(size = 15),
legend.text = element_text(size = 10), legend.title = element_text(size = 15))
pcnv_cl_Sigminer_sequenza<-read.table("/xdisk/mpadi/jiawenyang/data/centrosome_loss/wgs/processed/cnv_sequenza_cl_Sigminer.tsv", sep = "\t")
cnv_tumor_Sigminer_sequenza<-read.table(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/sigminer/PC_CNA_signature/data/CNV_from_sequenza.tsv", head = T)
cnv_all_Sigminer_sequenza<-rbind(cnv_cl_Sigminer_sequenza, cnv_tumor_Sigminer_sequenza)#To calculate CNAburden for a sample, segments of copy number gains and losses were determined (23), and their total genomic length was summed and calculated as a percentage of the size of the autosomalgenome (exclude chr x and y).
hg38chrlength<-read.table(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/wgs/hg38ChromLength.txt")
hg38chrlength$V2<-as.numeric(hg38chrlength$V2)
hg38chrlength$V1<-paste0("chr", hg38chrlength$V1)
hg38chrlength<-hg38chrlength[!hg38chrlength$V1 %in% c("chrX", "chrY"),]
#for sequenza
samples<-unique(cnv_all_Sigminer_sequenza$sample)
chrs<-hg38chrlength$V1
cnv_chr_burden_sequenza<-list()
cnv_all_burden_sequenza<-data.frame()
for (i in 7:length(samples)){
sample_names<-samples[i]
df_ind_sample<-cnv_all_Sigminer_sequenza[cnv_all_Sigminer_sequenza$sample == sample_names, ]
df_ind_sample_autosomal<-df_ind_sample[!df_ind_sample$Chromosome %in% c("chrX", "chrY"),]
df_ind_sample_autosomal<-df_ind_sample_autosomal[df_ind_sample_autosomal$modal_cn != 2,]
df_sample<-data.frame()
for (k in 1:length(chrs)) {
chromosome<-chrs[k]
cnv_region_ind_chr<-df_ind_sample_autosomal[df_ind_sample_autosomal$Chromosome == chromosome,]
cnv_region_ind_chr<-na.omit(cnv_region_ind_chr)
if (nrow(cnv_region_ind_chr) == 0) {chr_burden <- 0} else {
cnv_region_ind_chr_length<-sum(cnv_region_ind_chr$End.bp - cnv_region_ind_chr$Start.bp + 1)
chr_burden<-cnv_region_ind_chr_length/hg38chrlength[hg38chrlength$V1 == chromosome, "V2"]}
df_ind<-data.frame(sample = sample_names,
chr = chromosome,
cnv_region_length = cnv_region_ind_chr_length,
burden = chr_burden)
df_sample<-rbind(df_sample, df_ind)
cnv_all_burden<-sum(df_ind$cnv_region_length)/sum(hg38chrlength$V2)
cnv_all_burden_df<-data.frame(sample = sample_names, burden = cnv_all_burden)
}
cnv_chr_burden_sequenza[[sample_names]]<-df_sample
cnv_all_burden_sequenza<-rbind(cnv_all_burden_sequenza, cnv_all_burden_df)
}cnv_all_burden_sequenza<-read.csv("/xdisk/mpadi/jiawenyang/result/centrosome_loss/cnv_burden/cnv_all_burden_sequenza.csv", row.names = 1)
tcga_cnv_burden<-cnv_all_burden_sequenza[grep("TCGA", cnv_all_burden_sequenza$sample),]#import sigminer classification result
cn_sig1_tcga<-readRDS("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/clinical_association_table/sequenza_tcga_cn_sig1_tcga_samples.rds")
cn_sig2_tcga<-readRDS("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/clinical_association_table/sequenza_tcga_cn_sig2_tcga_samples.rds")
cn_sig3_tcga<-readRDS("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/clinical_association_table/sequenza_tcga_cn_sig3_tcga_samples.rds")
tcga_cnv_burden[tcga_cnv_burden$sample %in% cn_sig1_tcga, "CN_sig"]<-"CN-sig1"
tcga_cnv_burden[tcga_cnv_burden$sample %in% cn_sig2_tcga, "CN_sig"]<-"CN-sig2"
tcga_cnv_burden[tcga_cnv_burden$sample %in% cn_sig3_tcga, "CN_sig"]<-"CN-sig3"
# Perform one-way ANOVA
anova_result<-aov(burden ~ CN_sig, tcga_cnv_burden)
summary(anova_result)## Df Sum Sq Mean Sq F value Pr(>F)
## CN_sig 2 0.000623 3.115e-04 9.283 0.00011 ***
## Residuals 495 0.016613 3.356e-05
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = burden ~ CN_sig, data = tcga_cnv_burden)
##
## $CN_sig
## diff lwr upr p adj
## CN-sig2-CN-sig1 0.0009927497 -0.001919844 0.003905343 0.7024146
## CN-sig3-CN-sig1 0.0053619933 0.001662847 0.009061140 0.0020452
## CN-sig3-CN-sig2 0.0043692435 0.001911667 0.006826821 0.0001023
p<-ggplot(data=tcga_cnv_burden, aes(x=CN_sig, y=burden, fill=CN_sig, color = CN_sig)) +
theme_light()+#change background
geom_violin(alpha = 0.4)+
geom_jitter(shape = 16, position = position_jitter(0.2)) +
geom_boxplot(width = 0.1, alpha = 0.8)+
scale_fill_manual(values=c(`CN-sig1`="#050506",`CN-sig2`="#DE8B36", `CN-sig3`="#CD4224"))+
scale_color_manual(values=c(`CN-sig1`="#050506",`CN-sig2`="#DE8B36", `CN-sig3`="#CD4224"))+
labs(title="Associate Copy number signatures with CNV burden", x="Copy number signature", y = "CNV burden \n (Fraction of autosomal genome)")+
theme(plot.title = element_text(size=20),
axis.text.x = element_text(size = 10), axis.text.y = element_text(size = 10),
axis.title.x = element_text(size = 15), axis.title.y = element_text(size = 15),
legend.text = element_text(size = 10), legend.title = element_text(size = 15))
pload(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/sigminer/PC_CNA_signature/output/CNV.facets.RData")
sigminer_facets_cnv<-CNV.facets@data
sigminer_facets_cnv<-as.data.frame(sigminer_facets_cnv)
colnames(sigminer_facets_cnv)<-c("Chromosome", "Start.bp", "End.bp", "modal_cn", "sample")
ncore = 1
sigminer_prad_facets<-read_copynumber(input ="/xdisk/mpadi/jiawenyang/data/centrosome_loss/wgs/processed/cnv_facets_sigminer.tsv",
genome_build = "hg38",
complement = FALSE,
verbose = T)
sigminer_prad_facets.tally.W<-sig_tally(sigminer_prad_facets, method = "W", core = ncore, feature_setting = CN.features)
CNV_facets_sigminer_prad_matrix<-sigminer_prad_facets.tally.W$nmf_matrixcolumns_cnv_SigMatrixExtractor<-c("chrom", "start", "end", "tcn.em", "lcn.em")
columns_cnv_Sigminer<-c("chrom", "start", "end", "tcn.em")
FILES_FACETS_CNV<-list.files(path = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/facets",
pattern = "*.Rdata$",
recursive = T,
full.names = T)
cnv_all_SigMatrixExtractor<-data.frame()
cnv_all_Sigminer<-data.frame()
for (i in 1:length(FILES_FACETS_CNV)){
id<-basename(FILES_FACETS_CNV[i])
id<-strsplit(id, "[.]")[[1]][1]
cn<-get(load(FILES_FACETS_CNV[i]))
cn_SigMatrixExtractor<-cn$cncf[,columns_cnv_SigMatrixExtractor]
cn_Sigminer<-cn$cncf[,columns_cnv_Sigminer]
cn_SigMatrixExtractor$chrom<-paste0("chr", cn_SigMatrixExtractor$chrom)
cn_Sigminer$chrom<-paste0("chr", cn_Sigminer$chrom)
cn_SigMatrixExtractor[,"sample"]<-rep(id, nrow(cn_SigMatrixExtractor))
cn_Sigminer[,"sample"]<-rep(id, nrow(cn_Sigminer))
cnv_all_Sigminer<-rbind(cnv_all_Sigminer, cn_Sigminer)
cnv_all_SigMatrixExtractor<-rbind(cnv_all_SigMatrixExtractor, cn_SigMatrixExtractor)
}cnv_facets_cl_Sigminer<-read.table(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/wgs/processed/cnv_facets_cl_Sigminer.tsv")
cnv_facets_cl_Sigminer_cells<-cnv_facets_cl_Sigminer[cnv_facets_cl_Sigminer$sample %in% c("CN_1_1_CKDN200002992-1A_H57L2DSXY_L1", "CN_6_1_CKDN200002994-1A_H57L2DSXY_L1", "CN_9_1_CKDN200002996-1A_H57L2DSXY_L1"),]load("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/EST.CNV.facets_cl_cells_and_patients.W.all.Rdata")
p_CNV.facets_cl_cell_and_PRAD_patients<-show_sig_number_survey(EST.CNV.facets_cl_cells_and_patients.W.all, right_y = NULL)
p_CNV.facets_cl_cell_and_PRAD_patientscombine_facets_nmf_matrix<-readRDS(file = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/combine_facets_centrosome_loss_cells_nmf_matrix")
Sig.CNV.facets_cl_cells_prad_patients <- sig_extract(combine_facets_nmf_matrix, n_sig = 3, nrun = 50, cores = ncores)
Sig.CNV.facets_cl_cells_prad_patients_3_sig<-Sig.CNV.facets_cl_cells_prad_patientsSig.CNV.facets_cl_cells_prad_patients_3_sig<-readRDS("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/Sig.CNV.facets_cl_cells_prad_patients_3_sig.RDS")
Sig.CNV.facets_cl_cells_prad_patients_3_sig_groups<-get_groups(Sig.CNV.facets_cl_cells_prad_patients_3_sig)##
## Sig1 Sig2 Sig3
## 1 166 0 5
## 2 0 0 162
## 3 23 298 282
p_CNV.facets_cl_cells_prad_patients_3_sig_groups<-show_sig_profile(Sig.CNV.facets_cl_cells_prad_patients_3_sig,
mode = "copynumber",
style = "cosmic", method = "W", normalize = "feature")
print(p_CNV.facets_cl_cells_prad_patients_3_sig_groups)facets_sig1_samples<-Sig.CNV.facets_cl_cells_prad_patients_3_sig_groups[Sig.CNV.facets_cl_cells_prad_patients_3_sig_groups$enrich_sig == "Sig1",]
#nrow(facets_sig1_samples) 171 samples
facets_sig2_samples<-Sig.CNV.facets_cl_cells_prad_patients_3_sig_groups[Sig.CNV.facets_cl_cells_prad_patients_3_sig_groups$enrich_sig == "Sig2",]
#nrow(facets_sig2_samples) 603 samples
facets_sig3_samples<-Sig.CNV.facets_cl_cells_prad_patients_3_sig_groups[Sig.CNV.facets_cl_cells_prad_patients_3_sig_groups$enrich_sig == "Sig3",]
#nrow(facets_sig3_samples) 162 samples
facets_sig3_tcga<-facets_sig3_samples[grep("TCGA",facets_sig3_samples$sample),]
facets_sig2_tcga<-facets_sig2_samples[grep("TCGA", facets_sig2_samples$sample),]
facets_sig1_tcga<-facets_sig1_samples[grep("TCGA", facets_sig1_samples$sample),] surv_df <- readRDS(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/sigminer/PC_CNA_signature/data/PRAD_Survival.rds")
surv_df <- as.data.frame(surv_df)
df.CNV.facets_cl_cells_prad_patients<-as.data.frame(t(Sig.CNV.facets_cl_cells_prad_patients_3_sig$Exposure))
df.CNV.facets_cl_cells_prad_patients[,"sample"]<-rownames(df.CNV.facets_cl_cells_prad_patients)
range01 <- function(x, ...) {
(x - min(x, ...)) / (max(x, ...) - min(x, ...))
}
surv_dt <- dplyr::inner_join(surv_df, df.CNV.facets_cl_cells_prad_patients ,
by = c("sample"))
#dup_ids = which(duplicated(surv_df$sample))
# Remove duplicated records if needed
cols_to_sigs.facets<- c(paste0("Sig", c(1:3)))
surv_dt = surv_dt %>%
dplyr::mutate_at(cols_to_sigs.facets, ~./10) %>%
dplyr::mutate(OS.time = OS.time / 30.5,
PFI.time = PFI.time / 30.5)
p_os = show_forest(surv_dt,
covariates = cols_to_sigs.facets,
time = "OS.time", status = "OS", merge_models = TRUE)
p_pfs = show_forest(surv_dt,
covariates = cols_to_sigs.facets,
time = "PFI.time", status = "PFI", merge_models = TRUE)
p_os#Survival analysis (data downloaded from cbioportal) =================
tcga_clinical_cbioportal<-read.delim("/xdisk/mpadi/jiawenyang/data/centrosome_loss/cbioportal_dataset/prad_tcga_pan_can_atlas_2018/data_clinical_patient.txt", sep = "\t", stringsAsFactors = F, header = 1)
##Clinical data organization==========================
tcga_clinical_cbioportal_dat<-tcga_clinical_cbioportal[,c("X.Patient.Identifier","Overall.Survival..Months.","Overall.Survival.Status", "Disease.Free..Months.","Disease.Free.Status")]
facets_sig1_patient<-Sig.CNV.facets_cl_cells_prad_patients_3_sig_groups[Sig.CNV.facets_cl_cells_prad_patients_3_sig_groups$enrich_sig == "Sig1", ]
facets_sig1_patient_tcga<-facets_sig1_patient$sample[grep("TCGA", facets_sig1_patient$sample)]
facets_sig2_patient<-Sig.CNV.facets_cl_cells_prad_patients_3_sig_groups[Sig.CNV.facets_cl_cells_prad_patients_3_sig_groups$enrich_sig == "Sig2", ]
facets_sig2_patient_tcga<-facets_sig2_patient$sample[grep("TCGA", facets_sig2_patient$sample)]
facets_sig3_patient<-Sig.CNV.facets_cl_cells_prad_patients_3_sig_groups[Sig.CNV.facets_cl_cells_prad_patients_3_sig_groups$enrich_sig == "Sig3", ]
facets_sig3_patient_tcga<-facets_sig3_patient$sample[grep("TCGA", facets_sig3_patient$sample)]
sig1_patient<-facets_sig1_patient_tcga
sig2_patient<-facets_sig2_patient_tcga
sig3_patient<-facets_sig3_patient_tcga
sig1_patient_clinical_data<-tcga_clinical_cbioportal_dat[tcga_clinical_cbioportal_dat$X.Patient.Identifier %in% sig1_patient, ]
sig2_patient_clinical_data<-tcga_clinical_cbioportal_dat[tcga_clinical_cbioportal_dat$X.Patient.Identifier %in% sig2_patient, ]
sig3_patient_clinical_data<-tcga_clinical_cbioportal_dat[tcga_clinical_cbioportal_dat$X.Patient.Identifier %in% sig3_patient, ]
sig1_patient_clinical_data$overall_status<-unlist(lapply(sig1_patient_clinical_data$Overall.Survival.Status, function(x) strsplit(x, ":")[[1]][1]))
sig1_patient_clinical_data$group_status<-rep(0, nrow(sig1_patient_clinical_data))
sig2_patient_clinical_data$overall_status<-unlist(lapply(sig2_patient_clinical_data$Overall.Survival.Status, function(x) strsplit(x, ":")[[1]][1]))
sig2_patient_clinical_data$group_status<-rep(1, nrow(sig2_patient_clinical_data))
sig3_patient_clinical_data$overall_status<-unlist(lapply(sig3_patient_clinical_data$Overall.Survival.Status, function(x) strsplit(x, ":")[[1]][1]))
sig3_patient_clinical_data$group_status<-rep(2, nrow(sig3_patient_clinical_data))
clinical_dat_for_survival_curve<-rbind(sig1_patient_clinical_data, sig2_patient_clinical_data, sig3_patient_clinical_data)
##overall_survival==============================
tcga_overall<-clinical_dat_for_survival_curve[,c("X.Patient.Identifier","Overall.Survival..Months.","overall_status","group_status")]
tcga_overall$Overall.Survival..Months.<-as.numeric(tcga_overall$Overall.Survival..Months.)
tcga_overall$overall_status<-as.numeric(tcga_overall$overall_status)
#Disease_free_survival=========================
tcga_diseasefree<-clinical_dat_for_survival_curve[which(clinical_dat_for_survival_curve$Disease.Free..Months.!=""), ]
tcga_diseasefree$diseasefree_status<-unlist(lapply(tcga_diseasefree$Disease.Free.Status, function(x) strsplit(x, ":")[[1]][1]))
tcga_diseasefree<-tcga_diseasefree[,c("X.Patient.Identifier","Disease.Free..Months.","diseasefree_status","group_status")]
tcga_diseasefree$Disease.Free..Months.<-as.numeric(tcga_diseasefree$Disease.Free..Months.)
tcga_diseasefree$diseasefree_status<-as.numeric(tcga_diseasefree$diseasefree_status)
#Kaplan-meier curve generation========================
overall_S<-Surv(tcga_overall$Overall.Survival..Months., tcga_overall$overall_status)
overall_Sfit<-survfit(Surv(Overall.Survival..Months., overall_status)~group_status, data = tcga_overall)
tcga_prad_overall<-ggsurvplot(overall_Sfit,
conf.int = F,
pval = T,
risk.table = T,
legend.labs = c("CN-sig1", "CN-sig2", "CN-sig3"),
legend.title = "Copy number signature",
xlab = "Time (month)",
title = "Kaplan-Meier Curve for TCGA Prostate cancer patients-Overall survival",
palette = c("#5BC0EB", "#050506", "#6600CC"),
font.tickslab = c(15),
risk.table.height = .15)
tcga_prad_overalldiseasefree_Sfit<-survfit(Surv(Disease.Free..Months., diseasefree_status)~group_status, data = tcga_diseasefree)
tcga_prad_diseasefree<-ggsurvplot(diseasefree_Sfit,
conf.int = F,
pval = T,
risk.table = T,
legend.labs = c("CN-sig1", "CN-sig2", "CN-sig3"),
legend.title = "Copy number signature",
xlab = "Time (month)",
title = "Kaplan-Meier Curve for TCGA Prostate cancer patients-Disease-free survival",
palette = c("#5BC0EB", "#050506", "#6600CC"),
font.tickslab = c(15),
risk.table.height = .15)
tcga_prad_diseasefreeclinical_data<-readRDS("/xdisk/mpadi/jiawenyang/data/centrosome_loss/sigminer/PRAD_CLINICAL.rds")
clinical_data<-clinical_data[clinical_data$Study == "TCGA",]
clinical_data<-as.data.frame(clinical_data)
clinical_dat_df<-clinical_data
sig1_gleason<-clinical_dat_df[clinical_dat_df$PatientID%in% sig1_patient, ]
sig1_gleason[,"CN_sig"]<-rep("CN-sig1", nrow(sig1_gleason))
sig2_gleason<-clinical_dat_df[clinical_dat_df$PatientID %in% sig2_patient, ]
sig2_gleason[,"CN_sig"]<-rep("CN-sig2", nrow(sig2_gleason))
sig3_gleason<-clinical_dat_df[clinical_dat_df$PatientID %in% sig3_patient, ]
sig3_gleason[,"CN_sig"]<-rep("CN-sig3", nrow(sig3_gleason))
gleason_df<-rbind(sig1_gleason, sig2_gleason, sig3_gleason)
# Perform one-way ANOVA
anova_result<-aov(GleasonScore ~ CN_sig, gleason_df)
summary(anova_result)## Df Sum Sq Mean Sq F value Pr(>F)
## CN_sig 2 8.4 4.186 4.162 0.0161 *
## Residuals 495 497.8 1.006
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = GleasonScore ~ CN_sig, data = gleason_df)
##
## $CN_sig
## diff lwr upr p adj
## CN-sig2-CN-sig1 -0.2477522 -0.77464061 0.2791361 0.5111849
## CN-sig3-CN-sig1 0.1696429 -0.44715794 0.7864436 0.7944055
## CN-sig3-CN-sig2 0.4173951 0.05858838 0.7762018 0.0177437
p<-ggplot(data=gleason_df, aes(x=CN_sig, y=GleasonScore , fill=CN_sig, color = CN_sig)) +
theme_light()+#change background
geom_violin(alpha = 0.4)+
geom_jitter(shape = 16, position = position_jitter(0.2)) +
geom_boxplot(width = 0.1, alpha = 0.8)+
scale_fill_manual(values=c(`CN-sig1`= "#5BC0EB",`CN-sig2`="#050506", `CN-sig3`="#6600CC"))+
scale_color_manual(values=c(`CN-sig1`="#5BC0EB",`CN-sig2`="#050506", `CN-sig3`="#6600CC"))+
geom_hline(yintercept = mean(gleason_df$GleasonScore), linetype = 2)+
labs(title="Associate Copy number signatures with PCa Gleason Score", x="Copy number signature", y = "gleason score")+
theme(plot.title = element_text(size=20),
axis.text.x = element_text(size = 10), axis.text.y = element_text(size = 10),
axis.title.x = element_text(size = 15), axis.title.y = element_text(size = 15),
legend.text = element_text(size = 10), legend.title = element_text(size = 15))
pcnv_all_Sigminer_facet<-read.table(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/wgs/processed/cnv_facets_all.tsv", sep = "\t")#To calculate CNAburden for a sample, segments of copy number gains and losses were determined (23), and their total genomic length was summed and calculated as a percentage of the size of the autosomalgenome (exclude chr x and y).
cnv_all_Sigminer_facet<-cnv_all_Sigminer_facet[!cnv_all_Sigminer_facet$sample %in% c("CN2_2b_CKDN200005067-1A_H5T7VDSXY_L1", "CN9R1_1_CKDN190048205-1A_HV3TFDSXX_L1", "CN9R1_2_CKDN190048207-1A_HTC3FDSXX_L2", "CN9R2_1_CKDN190048209-1A_HTC3FDSXX_L2", "CN9line2_1_CKDN210017617-1A_H23WLDSX3_L2", "CN9line7_1_CKDN210017619-1A_H23WLDSX3_L2"),]
hg38chrlength<-read.table(file = "/xdisk/mpadi/jiawenyang/data/centrosome_loss/wgs/hg38ChromLength.txt")
hg38chrlength$V2<-as.numeric(hg38chrlength$V2)
hg38chrlength$V1<-paste0("chr", hg38chrlength$V1)
hg38chrlength<-hg38chrlength[!hg38chrlength$V1 %in% c("chrX", "chrY"),]
#for facets
samples<-unique(cnv_all_Sigminer_facet$sample)
chrs<-hg38chrlength$V1
cnv_chr_burden_facets<-list()
cnv_all_burden_facets<-data.frame()
for (i in 1:length(samples)){
sample_names<-samples[i]
df_ind_sample<-cnv_all_Sigminer_facet[cnv_all_Sigminer_facet$sample == sample_names, ]
df_ind_sample_autosomal<-df_ind_sample[!df_ind_sample$Chromosome %in% c("chrX", "chrY"),]
df_ind_sample_autosomal<-df_ind_sample_autosomal[df_ind_sample_autosomal$modal_cn != 2,]
#print(sample_names)
df_sample<-data.frame()
for (k in 1:length(chrs)) {
chromosome<-chrs[k]
cnv_region_ind_chr<-df_ind_sample_autosomal[df_ind_sample_autosomal$Chromosome == chromosome,]
cnv_region_ind_chr<-na.omit(cnv_region_ind_chr)
if (nrow(cnv_region_ind_chr) == 0) {chr_burden <- 0} else {
cnv_region_ind_chr_length<-sum(cnv_region_ind_chr$End.bp - cnv_region_ind_chr$Start.bp + 1)
chr_burden<-cnv_region_ind_chr_length/hg38chrlength[hg38chrlength$V1 == chromosome, "V2"]}
df_ind<-data.frame(sample = sample_names,
chr = chromosome,
cnv_region_length = cnv_region_ind_chr_length,
burden = chr_burden)
df_sample<-rbind(df_sample, df_ind)
cnv_all_burden<-sum(df_ind$cnv_region_length)/sum(hg38chrlength$V2)
cnv_all_burden_df<-data.frame(sample = sample_names, burden = cnv_all_burden)
}
cnv_chr_burden_facets[[sample_names]]<-df_sample
cnv_all_burden_facets<-rbind(cnv_all_burden_facets, cnv_all_burden_df)
}cnv_all_burden_facets<-read.csv("/xdisk/mpadi/jiawenyang/result/centrosome_loss/cnv_burden/cnv_all_burden_facets.csv", row.names = 1)
tcga_cnv_burden_facets<-cnv_all_burden_facets[grep("TCGA", cnv_all_burden_facets$sample),]#import sigminer classification result
facets_cn_sig1_tcga<-readRDS("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/clinical_association_table/facets_tcga_cn_sig1_samples.rds")
facets_cn_sig2_tcga<-readRDS("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/clinical_association_table/facets_tcga_cn_sig2_samples.rds")
facets_cn_sig3_tcga<-readRDS("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/clinical_association_table/facets_tcga_cn_sig3_samples.rds")
tcga_cnv_burden_facets[tcga_cnv_burden_facets$sample %in% facets_cn_sig1_tcga, "CN_sig"]<-"CN-sig1"
tcga_cnv_burden_facets[tcga_cnv_burden_facets$sample %in% facets_cn_sig2_tcga, "CN_sig"]<-"CN-sig2"
tcga_cnv_burden_facets[tcga_cnv_burden_facets$sample %in% facets_cn_sig3_tcga, "CN_sig"]<-"CN-sig3"
# Perform one-way ANOVA
anova_result<-aov(burden ~ CN_sig, tcga_cnv_burden_facets)
summary(anova_result)## Df Sum Sq Mean Sq F value Pr(>F)
## CN_sig 2 0.000146 7.306e-05 1.243 0.29
## Residuals 494 0.029045 5.879e-05
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = burden ~ CN_sig, data = tcga_cnv_burden_facets)
##
## $CN_sig
## diff lwr upr p adj
## CN-sig2-CN-sig1 -0.001444501 -0.005473300 0.002584298 0.6765280
## CN-sig3-CN-sig1 -0.002928254 -0.007644302 0.001787793 0.3113833
## CN-sig3-CN-sig2 -0.001483753 -0.004227506 0.001259999 0.4121289
p<-ggplot(data=tcga_cnv_burden_facets, aes(x=CN_sig, y=burden, fill=CN_sig, color=CN_sig)) +
theme_light()+#change background
geom_violin(alpha = 0.4)+
geom_jitter(shape = 16, position = position_jitter(0.2)) +
geom_boxplot(width = 0.1, alpha = 0.8)+
scale_fill_manual(values=c(`CN-sig1`="#5A7ECB",`CN-sig2`="#5BC0EB", `CN-sig3`="#6600CC"))+
scale_color_manual(values=c(`CN-sig1`="#5A7ECB",`CN-sig2`="#5BC0EB", `CN-sig3`="#6600CC"))+
labs(title="Associate Copy number signatures with CNV burden", x="Copy number signature", y = "CNV burden \n (Fraction of autosomal genome)")+
theme(plot.title = element_text(size=20),
axis.text.x = element_text(size = 10), axis.text.y = element_text(size = 10),
axis.title.x = element_text(size = 15), axis.title.y = element_text(size = 15),
legend.text = element_text(size = 10), legend.title = element_text(size = 15))
pquery<-GDCquery(project = "TCGA-PRAD",
data.category = "Copy Number Variation",
data.type = "Masked Copy Number Segment")
GDCdownload(query, method = "api")
PRAD_CNV_download<-GDCprepare(query = query, save = T, save.filename = "PRAD_CNV_download.rda")A = load("/xdisk/mpadi/jiawenyang/bin/gistic2.0/PRAD_CNV_download.rda")
tumorCNV<-eval(parse(text = A))
tumorCNV = tumorCNV[,2:7]
tumorCNV = tumorCNV[, c("Sample", "Chromosome", "Start", "End", "Num_Probes", "Segment_Mean")]
#Find 01A
tumorCNV[,"type"]<-unlist(lapply(tumorCNV$Sample, function(x) strsplit(x, "-")[[1]][4]))
tumorCNV = tumorCNV[tumorCNV$type == "01A",]
tumorCNV = tumorCNV[, 1:6]marker<-read.table("/xdisk/mpadi/jiawenyang/src/gistic/snp6.na35.remap.hg38.subset.txt", header = T)
marker = marker[marker$freqcnv == "FALSE",]
marker = marker[ ,c(1:3)]
colnames(marker) = c("Marker Name", "Chromosome", "Marker Position")tumorCNV[,"TCGA_ID"]<-unlist(lapply(tumorCNV$Sample, function(x) substr(x, 1, 12)))
sequenza_cn_signature<-read.delim2("/xdisk/mpadi/jiawenyang/result/centrosome_loss/sigminer/clinical_association_table/sequenz_clinical_dat_for_survival.csv", sep = ",", row.names = 2)
CN_sig1_seg<-tumorCNV[tumorCNV$TCGA_ID %in% rownames(sequenza_cn_signature[sequenza_cn_signature$group_status == "0",]), ]
CN_sig1_seg = CN_sig1_seg[,1:6]
CN_sig2_seg<-tumorCNV[tumorCNV$TCGA_ID %in% rownames(sequenza_cn_signature[sequenza_cn_signature$group_status == "1",]), ]
CN_sig2_seg = CN_sig2_seg[,1:6]
CN_sig3_seg<-tumorCNV[tumorCNV$TCGA_ID %in% rownames(sequenza_cn_signature[sequenza_cn_signature$group_status == "2",]), ]
CN_sig3_seg = CN_sig3_seg[,1:6]prad.gistic.cn_sig3<-readGistic(gisticAllLesionsFile = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/seqz_CN_sig3_TCGA/all_lesions.conf_90.txt",
gisticAmpGenesFile = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/seqz_CN_sig3_TCGA/amp_genes.conf_90.txt",
gisticDelGenesFile = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/seqz_CN_sig3_TCGA/del_genes.conf_90.txt",
gisticScoresFile = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/seqz_CN_sig3_TCGA/scores.gistic",
isTCGA = T
)## -Processing Gistic files..
## --Processing amp_genes.conf_90.txt
## --Processing del_genes.conf_90.txt
## --Processing scores.gistic
## --Summarizing by samples
prad.gistic.cn_sig2<-readGistic(gisticAllLesionsFile = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/seqz_CN_sig2_TCGA/all_lesions.conf_90.txt",
gisticAmpGenesFile = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/seqz_CN_sig2_TCGA/amp_genes.conf_90.txt",
gisticDelGenesFile = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/seqz_CN_sig2_TCGA/del_genes.conf_90.txt",
gisticScoresFile = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/seqz_CN_sig2_TCGA/scores.gistic",
isTCGA = T
)## -Processing Gistic files..
## --Processing amp_genes.conf_90.txt
## --Processing del_genes.conf_90.txt
## --Processing scores.gistic
## --Summarizing by samples
prad.gistic.cn_sig1<-readGistic(gisticAllLesionsFile = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/seqz_CN_sig1_TCGA/all_lesions.conf_90.txt",
gisticAmpGenesFile = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/seqz_CN_sig1_TCGA/amp_genes.conf_90.txt",
gisticDelGenesFile = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/seqz_CN_sig1_TCGA/del_genes.conf_90.txt",
gisticScoresFile = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/seqz_CN_sig1_TCGA/scores.gistic",
isTCGA = T
)## -Processing Gistic files..
## --Processing amp_genes.conf_90.txt
## --Processing del_genes.conf_90.txt
## --Processing scores.gistic
## --Summarizing by samples
prad.gistic.all<-readGistic(gisticAllLesionsFile = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/TCGA_PRAD/all_lesions.conf_90.txt",
gisticAmpGenesFile = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/TCGA_PRAD/amp_genes.conf_90.txt",
gisticDelGenesFile = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/TCGA_PRAD/del_genes.conf_90.txt",
gisticScoresFile =
"/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/TCGA_PRAD/scores.gistic",
isTCGA = T)## -Processing Gistic files..
## --Processing amp_genes.conf_90.txt
## --Processing del_genes.conf_90.txt
## --Processing scores.gistic
## --Summarizing by samples
## [1] "TCGA PRAD CN-signature 3 samples"
## [1] "TCGA PRAD CN-signature 2 samples"
## [1] "TCGA PRAD CN-signature 1 samples"
## [1] "TCGA PRAD all samples"
library(DT)
cn3_gistic_cnv_rank<-read.table("/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/seqz_CN_sig3_TCGA/all_lesions.conf_90.txt", sep = "\t", quote = "", header = T)
cn3_gistic_cnv_rank<-cn3_gistic_cnv_rank[order(cn3_gistic_cnv_rank$q.values, decreasing = F),]
cn3_gistic_cnv_rank<-cn3_gistic_cnv_rank[!duplicated(cn3_gistic_cnv_rank$Descriptor),]
datatable(cn3_gistic_cnv_rank)library(ggbio)
library(RCircos)
library(GenomicRanges)
library(biovizBase)
cn_sig3_cnv_amp<-read.table(file = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/seqz_CN_sig3_TCGA/amp_genes.conf_90.txt", sep = "\t", quote = "", header = T)
cn_sig3_cnv_del<-read.table(file = "/xdisk/mpadi/jiawenyang/result/centrosome_loss/GISTIC2.0/seqz_CN_sig3_TCGA/del_genes.conf_90.txt", sep = "\t", quote = "", header = T)
cn_sig3_TCGA_regions<-data.frame(seqnames = unlist(c(cn_sig3_cnv_amp[3, 2:ncol(cn_sig3_cnv_amp)], c(cn_sig3_cnv_del[3, 2:ncol(cn_sig3_cnv_del)]))),
cytobands = c(colnames(cn_sig3_cnv_amp)[2:ncol(cn_sig3_cnv_amp)],colnames(cn_sig3_cnv_del)[2:ncol(cn_sig3_cnv_del)]),
cnv_state=c(rep("gain", ncol(cn_sig3_cnv_amp)-1), rep("loss", ncol(cn_sig3_cnv_del)-1)),
q_value = unlist(c(cn_sig3_cnv_amp[2, c(2:ncol(cn_sig3_cnv_amp))], cn_sig3_cnv_del[2, c(2:ncol(cn_sig3_cnv_del))])))
cn_sig3_TCGA_regions<-na.omit(cn_sig3_TCGA_regions)
cn_sig3_TCGA_regions$q_value<-as.numeric(cn_sig3_TCGA_regions$q_value)
cn_sig3_TCGA_regions<-cn_sig3_TCGA_regions[cn_sig3_TCGA_regions$q_value <= 0.1,] #only include those q-value less than 0.1 (less strick cut-off)
cn_sig3_TCGA_regions<-cn_sig3_TCGA_regions[, c(1:3)]
cn_sig3_TCGA_regions[, "chromosome"]<-unlist(lapply(cn_sig3_TCGA_regions$seqnames, function(x) strsplit(x, ":")[[1]][1]))
cn_sig3_TCGA_regions[, "start"]<-unlist(lapply(unlist(lapply(cn_sig3_TCGA_regions$seqnames, function(x) strsplit(x, ":")[[1]][2])), function(x) strsplit(x, "-")[[1]][1]))
cn_sig3_TCGA_regions[, "end"]<-unlist(lapply(unlist(lapply(cn_sig3_TCGA_regions$seqnames, function(x) strsplit(x, ":")[[1]][2])), function(x) strsplit(x, "-")[[1]][2]))
cn_sig3_TCGA_regions[, "cytobands"]<-unlist(lapply(cn_sig3_TCGA_regions$cytobands, function(x) substr(x, 2, nchar(x))))
cn_sig3_TCGA_regions<-cn_sig3_TCGA_regions[, c("chromosome", "start", "end", "cytobands", "cnv_state")]
cn_sig3_TCGA_regions<-makeGRangesFromDataFrame(cn_sig3_TCGA_regions, keep.extra.columns = T)
load("/xdisk/mpadi/jiawenyang/result/centrosome_loss/cnvkit_wgs/xenograft_tumor_consistent_regions_granges.Rdata")
cn_centrosome_loss_regions<-c(olps_gain, olps_loss)
common_loss_region<-findOverlaps(cn_sig3_TCGA_regions[cn_sig3_TCGA_regions$cnv_state == "loss"], cn_centrosome_loss_regions[cn_centrosome_loss_regions$cnv_state == "loss"])
common_gain_region<-findOverlaps(cn_sig3_TCGA_regions[cn_sig3_TCGA_regions$cnv_state == "gain"], cn_centrosome_loss_regions[cn_centrosome_loss_regions$cnv_state == "gain"])
common_cnvs<-c(pintersect(cn_sig3_TCGA_regions[cn_sig3_TCGA_regions$cnv_state == "gain"][queryHits(common_gain_region),],
cn_centrosome_loss_regions[cn_centrosome_loss_regions$cnv_state == "gain"][subjectHits(common_gain_region)]),
pintersect(cn_sig3_TCGA_regions[cn_sig3_TCGA_regions$cnv_state == "loss"][queryHits(common_loss_region),],
cn_centrosome_loss_regions[cn_centrosome_loss_regions$cnv_state == "loss"][subjectHits(common_loss_region)]))
common_cnvs<-common_cnvs[!duplicated(common_cnvs$cytobands),]
seqlengths(common_cnvs)<-seqlengths(ideoCyto$hg19)[names(seqlengths(common_cnvs))]
data("UCSC.HG38.Human.CytoBandIdeogram")
data(ideoCyto, package = "biovizBase")
colnames(UCSC.HG38.Human.CytoBandIdeogram)<-c("Chromosome", "start", "end", "name", "gieStain")
cyto.info<-makeGRangesFromDataFrame(UCSC.HG38.Human.CytoBandIdeogram,keep.extra.columns = T)
p= autoplot(cyto.info[seqnames(cyto.info) %in% seqnames(common_cnvs)], layout = "karyogram", cytobands = T ) +
layout_karyogram(data = common_cnvs, aes(color = factor(cnv_state, levels = c("gain", "loss")), fill = cnv_state), size = 0.8, geom = "rect") +
labs(colour = "cnv_state") +
scale_color_manual(labels = c("gain", "loss"), values = c("orange", "purple")) +
scale_fill_manual(labels = c("gneg", "gpos100", "gpos25", "gpos50", "gpos75", "gvar", "stalk", "acen","gain","loss"), values = c(c("#f9f9f9", "#474747", "#cecece", "#a0a0a0", "#737373", "#474747", "#d36c6c", "#8b2323"), alpha(c("#cc0000", "#3399ff"), 0.6)))
print(p)#Protein coding gene annotation
ah <- AnnotationHub::AnnotationHub()
ahDb <- AnnotationHub::query(ah, pattern = c("Homo sapiens", "EnsDb"))
ahEdb <- ahDb[["AH109606"]]
bt.genes<-ensembldb::genes(ahEdb)
GenomeInfoDb::seqlevelsStyle(bt.genes)<-"UCSC"
pc.genes<-subset(bt.genes, gene_biotype == "protein_coding")
seqlengths(common_cnvs)<-seqlengths(pc.genes)[names(seqlengths(common_cnvs))]
#Annotate cnv covered genes
olpas <- findOverlaps(pc.genes, common_cnvs, ignore.strand=TRUE)
qh <- S4Vectors::queryHits(olpas)
sh <- S4Vectors::subjectHits(olpas)
covered_genes<-pc.genes[qh,"symbol"]
regions<-common_cnvs[sh,"cytobands"]
df<-cbind(data.frame(covered_genes), data.frame(regions))
DT::datatable(df[ ,c(1:6, 12)])